import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Location of the raw CSV that the rest of the script analyses.
file = 'data/singapore-residents-by-age-group-ethnic-group-and-sex-end-june-annual.csv'
data = pd.read_csv(file)
data.head()

# Swap the generic level_* headers for descriptive column names.
column_names = {'level_1': 'Ethnic_gender', 'level_2': 'Age_group'}
data.rename(columns=column_names, inplace=True)
data.head()
data['Ethnic_gender'].unique()
m_f = {}

# Print every distinct Ethnic_gender label as a dict-entry template; the
# mapping below was written by hand from this listing.
for label in data['Ethnic_gender'].unique():
    print(f""" '{label}':'M', """)

# Code assigned to each Ethnic_gender label:
#   'N' -> total (both sexes), 'M' -> male, 'F' -> female
m_f = {
    'Total Residents': 'N',
    'Total Male Residents': 'M',
    'Total Female Residents': 'F',
    'Total Malays': 'N',
    'Total Male Malays': 'M',
    'Total Female Malays': 'F',
    'Total Chinese': 'N',
    'Total Male Chinese': 'M',
    'Total Female Chinese': 'F',
    'Total Indians': 'N',
    'Total Male Indians': 'M',
    'Total Female Indians': 'F',
    'Other Ethnic Groups (Total)': 'N',
    'Other Ethnic Groups (Males)': 'M',
    'Other Ethnic Groups (Females)': 'F',
}

# Look each label up directly so an unmapped label raises KeyError instead
# of silently producing a missing code.
data['gender'] = data['Ethnic_gender'].map(m_f.__getitem__)
data.head()
# Inspect column dtypes and count genuine NaN cells.
data.dtypes
data.isna().sum()

# Missing observations are compared against the literal string 'na' (not NaN),
# so they are located by string comparison. Prints the per-column count.
print((data.values == 'na').sum(axis=0))
len(data.groupby(by=['year', 'Ethnic_gender'])['value'].count().index)

# Visualise where the 'na' markers sit in the table.
plt.figure(figsize=(30, 12))
sns.heatmap(data == 'na')

# Build the row mask once and reuse it for both the split and the clean-up.
value_is_na = data['value'].to_numpy() == 'na'

# Rows without a usable value, kept aside for inspection.
# `drop=True` discards the old index (the original passed the string 'index',
# which only worked because any non-empty string is truthy).
missing_values = data.loc[value_is_na].reset_index(drop=True)
missing_values.head()

# Keep only rows with a real value, then confirm no 'na' marker is left
# (the selection below is expected to be empty).
data = data.loc[~value_is_na].reset_index(drop=True)
data.loc[data['value'].to_numpy() == 'na']

# With the markers gone the column can be converted to integers. Use item
# assignment rather than attribute assignment to set the column.
data['value'] = data['value'].astype('int')
# Distinct years present in the cleaned data.
np.unique(data['year'].to_numpy())

# Distinct age-group labels, sorted by np.unique.
unique_Age_group = np.unique(data['Age_group'].to_numpy())
print("No of unique_Age_group ", unique_Age_group.size)
print(unique_Age_group)

# Distinct ethnic/sex labels, sorted by np.unique.
unique_Ethnic_gender = np.unique(data['Ethnic_gender'].to_numpy())
print("No of unique_Ethnic_gender ", unique_Ethnic_gender.size)
print(unique_Ethnic_gender)
# Largest ethnic group in Singapore: Total Residents
# Proportion of the total population they constitute: 50 %
# Average yearly population growth: 0.02055817990465972
def largest_group(data, value, value_col='value'):
    """Sum a column per group and rank the groups from largest to smallest.

    Args:
        data: DataFrame containing the grouping column(s) and ``value_col``.
        value: Column name (or list of column names) to group by.
        value_col: Name of the column summed within each group.
            Defaults to ``'value'``, the column the original always used.

    Returns:
        Series indexed by group, holding each group's total in descending
        order.
    """
    totals = data.groupby(by=value)[value_col].sum()
    return totals.sort_values(ascending=False)
# Rank the labels coded 'N' (both sexes combined) by their summed population.
# NOTE(review): the 'N' rows include the label 'Total Residents', which is
# presumably the aggregate of the individual ethnic groups -- confirm whether
# it should take part in this ranking.
Ethnic_group = largest_group(data[data['gender'] == 'N'], 'Ethnic_gender')
Ethnic_group

fig = px.pie(values=Ethnic_group.values, names=Ethnic_group.index,
             title='Population by ethnicity')
fig.show()

# Derive the share of the top-ranked label from the data instead of printing
# a hard-coded figure, so the answer stays correct if the data changes.
largest_ethnic_label = Ethnic_group.index[0]
largest_share = Ethnic_group.iloc[0] / Ethnic_group.sum()
print('what proportion of the total population do they constitute :: '
      f'{largest_share * 100:.0f} %')

# Year-over-year growth of the top-ranked label.
growth_data = data[data['Ethnic_gender'] == largest_ethnic_label]
growth_data = growth_data.groupby(by=['year'])['value'].sum().pct_change()
fig = px.line(x=growth_data.index,
              y=growth_data.values,
              title=f"Growth rate yearly For {largest_ethnic_label}")
fig.show()

# The first pct_change entry is NaN (no previous year), so skip it.
print('average population growth over the years :: ', growth_data.values[1:].mean())
# Growth rate by age group for the top-ranked Ethnic_gender label.
print('growth rate by age for ', Ethnic_group.index[0])

growth_data = data[data['Ethnic_gender'] == Ethnic_group.index[0]]

# Size the panel grid from the number of age groups. The original hard-coded
# an 11x2 grid and raised ValueError unless there were exactly 22 groups.
n_cols = 2
n_rows = max(1, (len(unique_Age_group) + n_cols - 1) // n_cols)
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 20), squeeze=False)

growth_rate = []  # (mean yearly growth, age-group label)
panels = ax.ravel()  # row-major: left panel first, then right, row by row
for panel, age_label in zip(panels, unique_Age_group):
    yearly_growth = (
        growth_data[growth_data['Age_group'] == age_label]
        .groupby(by=['year'])['value']
        .sum()
        .pct_change()
    )
    # Skip the first entry: it is NaN because there is no previous year.
    growth_rate.append((yearly_growth.values[1:].mean(), age_label))
    panel.plot(yearly_growth.index, yearly_growth.values)
    panel.set_title(age_label)

# Hide any panel left unused when the number of age groups is odd.
for panel in panels[len(unique_Age_group):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()

# Insight (author's reading of the plots): life expectancy increased and the
# number of newborn children decreased.

# Bar chart of the age groups ordered from fastest to slowest mean growth.
plt.figure(figsize=(12, 10))
growth_rate.sort(key=lambda item: item[0], reverse=True)
if growth_rate:
    x, y = zip(*growth_rate)
    sns.barplot(y=list(y),
                x=list(x))
# Largest age group in Singapore: 5 - 9 Years
# Proportion of the total population they constitute: 7.56 %
# Average yearly population growth: -0.0011038524078855348
# Labels that m_f codes as 'N' (both sexes combined).
Ethnic_groups_name = [label for label, code in m_f.items() if code == 'N']
Ethnic_groups_name

# Restrict the data to those both-sexes rows.
is_total_row = data['gender'] == 'N'
Ethnic_groups = data[is_total_row]
Ethnic_groups

# Age groups ranked by summed population.
Age_group = largest_group(Ethnic_groups, 'Age_group')
Age_group

# Recorded result: 5 - 9 Years is the largest.
fig = px.pie(names=Age_group.index, values=Age_group.values,
             title='Population by Age')
fig.show()

# Year-over-year growth of the top-ranked age group.
top_age_rows = data[data['Age_group'] == Age_group.index[0]]
yearly_totals = top_age_rows.groupby(by=['year'])['value'].sum()
growth_data = yearly_totals.pct_change()
fig = px.line(
    x=growth_data.index,
    y=growth_data.values,
    title=f"Growth rate yearly For {Age_group.index[0]}",
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()

# Mean growth, skipping the leading NaN produced by pct_change.
print(growth_data.values[1:].mean())
# Growth rate per age group across the both-sexes ('N') rows.
growth_rate = []  # (mean yearly growth, age-group label)
growth_data = Ethnic_groups

# Age groups in the order produced by the population ranking.
ranked_age_labels = list(Age_group.index)

# Size the panel grid from the number of age groups. The original hard-coded
# an 11x2 grid and raised ValueError unless there were exactly 22 groups.
n_cols = 2
n_rows = max(1, (len(ranked_age_labels) + n_cols - 1) // n_cols)
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 20), squeeze=False)

panels = ax.ravel()  # row-major: left panel first, then right, row by row
for panel, age_label in zip(panels, ranked_age_labels):
    yearly_growth = (
        growth_data[growth_data['Age_group'] == age_label]
        .groupby(by=['year'])['value']
        .sum()
        .pct_change()
    )
    # Skip the first entry: it is NaN because there is no previous year.
    growth_rate.append((yearly_growth.values[1:].mean(), age_label))
    panel.plot(yearly_growth.index, yearly_growth.values)
    panel.set_title(age_label)

# Hide any panel left unused when the number of age groups is odd.
for panel in panels[len(ranked_age_labels):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()

# Insight (author's reading of the plots): life expectancy increased and the
# number of newborn children decreased.

# Bar chart of the age groups ordered from fastest to slowest mean growth.
plt.figure(figsize=(12, 10))
growth_rate.sort(key=lambda item: item[0], reverse=True)
if growth_rate:
    x, y = zip(*growth_rate)
    sns.barplot(y=list(y),
                x=list(x))
# Highest growth rate: (0.4044422424858464, 'Total Male Chinese85 Years & Over')
# Lowest growth rate: (-0.007212427829619562, 'Total Male Chinese 0 - 4 Years')
# Remained approximately the same (approximate estimate): (0.0013961680698216085, 'Total Female Malays 5 - 9 Years')
# Keep only the sex-specific rows (codes 'M' and 'F'); rows coded 'N' are the
# both-sexes totals.
eth_age_gen = data[data['gender'] != 'N']
eth_age_gen
eth_age_gen.Ethnic_gender.unique()

# Recorded result: 220 unique (Ethnic_gender, Age_group) index entries.
eth_age_gen.groupby(by=['Ethnic_gender', 'Age_group'])['year'].count()

# Mean year-over-year growth for every (Ethnic_gender, Age_group) combination.
growth_rate = []  # (mean yearly growth, group label)

# Select the columns with a list: tuple-style selection (['value', 'year']
# without the inner brackets) is rejected by pandas 2.x.
grouped = eth_age_gen.groupby(by=['Ethnic_gender', 'Age_group'])[['value', 'year']]
for group_key, group_frame in grouped:
    # Order by year so pct_change compares consecutive years regardless of
    # the row order in the source file.
    ordered = group_frame.sort_values('year')
    # Drop the first entry: it is NaN because there is no previous year.
    mean_growth = ordered['value'].pct_change().iloc[1:].to_numpy().mean()
    # Join the two key parts with a space; joining with '' glued the labels
    # together (e.g. '...Chinese85 Years & Over').
    group_label = ' '.join(part.strip() for part in group_key)
    growth_rate.append((mean_growth, group_label))

growth_rate.sort(key=lambda item: item[0], reverse=True)
print('highest growth rate :', growth_rate[0])
print('lowest growth rate :', growth_rate[-1])

# "Remained the same" means the mean growth closest to zero in either
# direction. The original took the smallest non-negative rate, which ignores
# slightly negative rates and fails when every rate is negative.
finite_rates = [item for item in growth_rate if np.isfinite(item[0])]
if finite_rates:
    print('remained growth same (Almost) :',
          min(finite_rates, key=lambda item: abs(item[0])))

# Bar chart of all combinations; growth_rate is already sorted descending.
plt.figure(figsize=(12, 40))
x, y = zip(*growth_rate)
sns.barplot(y=list(y),
            x=list(x))
# Population trends for the both-sexes ('N') rows.
trends = data[data['gender'] == 'N']
trends.head(1)

# All ethnic labels on one chart.
plt.figure(figsize=(15, 10))
sns.lineplot(x='year',
             y='value',
             hue='Ethnic_gender',
             data=trends)

# One panel per ethnic label. Select the columns with a list (tuple-style
# selection is rejected by pandas 2.x) and size the grid from the number of
# groups instead of assuming exactly five.
by_ethnicity = trends.groupby(by='Ethnic_gender')[['value', 'year']]
fig, ax = plt.subplots(nrows=max(1, by_ethnicity.ngroups), ncols=1,
                       figsize=(12, 10), squeeze=False)
for panel, (ethnicity, population_year) in zip(ax[:, 0], by_ethnicity):
    sns.lineplot(y=population_year['value'].values,
                 x=population_year['year'].values,
                 ax=panel)
    panel.set_title(ethnicity)
plt.tight_layout()
plt.show()

# One panel per age group. The grid is derived from the number of age groups;
# the original hard-coded 11x2 and failed unless there were exactly 22.
n_cols = 2
n_rows = max(1, (len(unique_Age_group) + n_cols - 1) // n_cols)
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 44), squeeze=False)
panels = ax.ravel()  # row-major: left panel first, then right, row by row
for panel, age_label in zip(panels, unique_Age_group):
    age_rows = trends[trends['Age_group'] == age_label]
    sns.lineplot(y=age_rows['value'].values,
                 x=age_rows['year'].values,
                 ax=panel)
    panel.set_title(age_label)

# Hide any panel left unused when the number of age groups is odd.
for panel in panels[len(unique_Age_group):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()
# Population trends split by sex: keep only rows coded 'M' or 'F'.
trends = data[data['gender'] != 'N']
plt.figure(figsize=(15, 10))
sns.lineplot(x='year',
             y='value',
             hue='gender',
             data=trends)

# Checking growth rate: year-over-year change of the yearly totals per sex.
# Sum only the 'value' column. Summing the whole frame also aggregates the
# string columns, and pct_change then fails on them under pandas 2.x.
# [['value']] keeps the result a DataFrame with a 'value' column, as before.
male_rows = trends[trends['gender'] == 'M']
female_rows = trends[trends['gender'] == 'F']
growth_rate_m = male_rows.groupby('year')[['value']].sum().pct_change().iloc[1:]
growth_rate_f = female_rows.groupby('year')[['value']].sum().pct_change().iloc[1:]

# Draw the growth curves on their own figure so they are not overlaid on the
# population chart above, and label them so the two lines can be told apart.
plt.figure()
sns.lineplot(y=growth_rate_m['value'].values,
             x=list(growth_rate_m.index.values),
             color='g',
             label='Male')
sns.lineplot(y=growth_rate_f['value'].values,
             x=list(growth_rate_f.index.values),
             color='b',
             label='Female')
plt.legend()